In [1]:
from IPython.display import display
In [2]:
import numpy
import pandas
pandas.options.display.float_format = '{:,.2f}'.format
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.5})
sns.set_style("whitegrid", rc={"lines.linewidth": .5})
import re
from sklearn import metrics
In [3]:
ITERATIONS = ['ENTITY', 'LKIF', 'YAGO']
In [4]:
FOLDS = 5
In [5]:
averages = ['micro', 'macro', 'weighted']
In [6]:
def get_input_files(input_dirpath, pattern):
    """Returns the names of the files in input_dirpath that match pattern."""
    all_files = os.listdir(input_dirpath)
    result = []
    for filename in all_files:
        if re.match(pattern, filename) and os.path.isfile(os.path.join(
                input_dirpath, filename)):
            result.append(os.path.join(input_dirpath, filename))
    return result
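A quick, illustrative check of the helper. The path and pattern below only mirror the directory layout used later in this notebook; they are not guaranteed to exist.
In [ ]:
# Illustrative only: list the test prediction files for the ENTITY iteration of fold 1.
get_input_files('../results/echr/evaluations/iter1/train/handcrafted',
                r'test_predictions_.*ENTITY.*csv')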
In [7]:
def add_prec_rec(predictions, results, dataset_name, iteration):
    """Adds micro, macro and weighted precision/recall/F1 to the results dataframe."""
    for average in averages:
        values = metrics.precision_recall_fscore_support(
            predictions.true, predictions.prediction,
            average=average, warn_for=()
        )[:3]
        columns = ['Prec({})'.format(average), 'Rec({})'.format(average),
                   'F1({})'.format(average)]
        results.loc[(dataset_name, iteration), columns] = values


def add_top_bottom(predictions, results, dataset_name, iteration):
    """Adds metrics averaged over the 20% most and least frequent classes."""
    if iteration == 'NER':
        return
    values = metrics.precision_recall_fscore_support(
        predictions.true, predictions.prediction, average=None, warn_for=())
    values = pandas.DataFrame(numpy.vstack(values).T,
                              columns=['Prec', 'Recall', 'F1score', 'Support'])
    to_take = int(values.shape[0] * 0.2)  # 20% of the classes
    top_values = values.sort_values('Support', ascending=False)[:to_take].mean().values[:3]
    results.loc[(dataset_name, iteration),
                ['Top 20% Prec', 'Top 20% Rec', 'Top 20% F1']] = top_values
    bottom_values = values.sort_values('Support', ascending=True)[:to_take].mean().values[:3]
    results.loc[(dataset_name, iteration),
                ['Bottom 20% Prec', 'Bottom 20% Rec', 'Bottom 20% F1']] = bottom_values
In [8]:
result_columns = ['Acc', 'Prec(macro)', 'Rec(macro)', 'F1(macro)',
                  'Prec(micro)', 'Rec(micro)', 'F1(micro)',
                  'Prec(weighted)', 'Rec(weighted)', 'F1(weighted)',
                  'Top 20% Prec', 'Bottom 20% Prec', 'Top 20% Rec',
                  'Bottom 20% Rec', 'Top 20% F1', 'Bottom 20% F1']
important_columns = ['Acc', 'Prec(macro)', 'Rec(macro)', 'F1(macro)']


def get_results(base_filename, dataset_names, base_dirname, iterations=ITERATIONS):
    """Reads the prediction files for each dataset/iteration pair and computes the metrics."""
    index = pandas.MultiIndex.from_product([iterations, averages], names=['Task', 'Average'])
    results = pandas.DataFrame(
        columns=result_columns,
        index=pandas.MultiIndex.from_product(
            [[x[0] for x in dataset_names], iterations], names=['Dataset', 'Task']))
    for dataset_name, dataset_description in dataset_names:
        prediction_dirname = base_dirname.format(dataset_name=dataset_name)
        for iteration in iterations:
            filenames = get_input_files(prediction_dirname, base_filename.format(iteration))
            if len(filenames) == 0:
                print('Error with iteration {} and dataset {}. {}'.format(
                    iteration, dataset_name,
                    os.path.join(prediction_dirname, base_filename.format(iteration))))
                continue
            filename = filenames[0]
            predictions = pandas.read_csv(filename)
            predictions.replace('I-I', 'I', inplace=True)
            results.loc[(dataset_name, iteration), 'Acc'] = metrics.accuracy_score(
                predictions.true, predictions.prediction)
            add_prec_rec(predictions, results, dataset_name, iteration)
            add_top_bottom(predictions, results, dataset_name, iteration)
    return results
In [9]:
def get_fold_results(prediction_base_filename, dataset_names, base_dirname, iterations=ITERATIONS):
    test_fold_results = []
    for fold in range(FOLDS):
        test_fold_results.append(get_results(
            prediction_base_filename, dataset_names,
            base_dirname.format(i=fold + 1), iterations=iterations))
    return test_fold_results
In [10]:
def highlight_max(data, color='yellow'):
    '''
    Highlight the maximum in a Series or DataFrame.
    '''
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pandas.DataFrame(numpy.where(is_max, attr, ''),
                                index=data.index, columns=data.columns)
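highlight_max is intended to be used through the pandas Styler; a minimal sketch, assuming the fold results computed further below in this notebook:
In [ ]:
# Sketch: highlight the best value of each metric column across datasets/tasks.
all_test_results[0][important_columns].style.apply(highlight_max, axis=0)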
In [11]:
def get_results_by_task(results, iterations=ITERATIONS):
    results_by_task = results[important_columns].swaplevel()
    for task in iterations:
        print(task)
        display(results_by_task.loc[task])
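get_results_by_task is not called elsewhere in this notebook; a hedged usage sketch, assuming the test results built further below:
In [ ]:
# Sketch: show the important metrics of the first fold grouped by task.
get_results_by_task(all_test_results[0])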
In [12]:
def plot_tasks(column_name, dataframe):
    if isinstance(dataframe, list):
        data = pandas.concat([x[[column_name]].reset_index() for x in dataframe])
    else:
        data = dataframe.reset_index()
    ax = sns.pointplot(x="Dataset", y=column_name, hue="Task", data=data, palette="Set2")
    plt.show()


def plot_datasets(column_name, dataframe):
    if isinstance(dataframe, list):
        data = pandas.concat([x[[column_name]].reset_index() for x in dataframe])
    else:
        data = dataframe.reset_index()
    ax = sns.pointplot(x="Task", y=column_name, hue="Dataset", data=data, palette="Set2")
    plt.show()
In [13]:
def compare_iterations(column_name, dataframes, iterations=ITERATIONS):
    metric_values = pandas.concat([x[[column_name]].reset_index() for x in dataframes],
                                  keys=range(FOLDS))
    metric_values.index = metric_values.index.droplevel(1)
    metric_values.index.name = 'Fold'
    metric_values = metric_values.reset_index()
    ax = sns.factorplot(x="Task", y=column_name, hue="Fold", col="Dataset", data=metric_values,
                        palette="Set2", col_wrap=2)
    plt.show()
In [30]:
def replace_dataframe_values(dataframe):
    replacements = {
        # Task names
        'entity': 'NERC', 'yago': 'YAGO', 'lkif': 'LKIF', 'ENTITY': 'NERC',
        # Dataset names
        'handcrafted': '(E)MLP', 'wv_echr': '(E)MLP+WV echr', 'wv_wiki': '(E)MLP+WV wiki',
        'wv_mixed': '(E)MLP+WV mix',
        'handcrafted_wiki_classifier': '(W)MLP', 'wv_echr_wiki_classifier': '(W)MLP+WV echr',
        'wv_wiki_wiki_classifier': '(W)MLP+WV wiki', 'wv_mixed_wiki_classifier': '(W)MLP+WV mix',
        # Metrics
        'Prec(macro)': 'Precision (macro)', 'Rec(macro)': 'Recall (macro)',
    }
    for old_value, new_value in replacements.items():
        dataframe.replace(old_value, new_value, inplace=True)
In [31]:
def plot_datasets_all_tasks(dataframes, columns=important_columns):
    metric_values = pandas.concat([x[columns].stack().reset_index() for x in dataframes])
    metric_values = metric_values.rename(columns={'level_2': 'Metric', 0: 'Value'})
    replace_dataframe_values(metric_values)
    ax = sns.factorplot(x="Task", y="Value", hue="Dataset", col="Metric", data=metric_values,
                        palette="Set2", col_wrap=len(columns),
                        markers=['o'] * 4 + ['v'] * 4, aspect=1.25)
    plt.show()


def plot_tasks_all_datasets(dataframes, columns=important_columns):
    metric_values = pandas.concat([x[columns].stack().reset_index() for x in dataframes])
    metric_values = metric_values.rename(columns={'level_2': 'Metric', 0: 'Value'})
    replace_dataframe_values(metric_values)
    ax = sns.factorplot(x="Dataset", y="Value", hue="Task", col="Metric", data=metric_values,
                        palette="Set2", col_wrap=len(columns))
    ax.set_xticklabels(rotation=30)
    plt.show()
In [32]:
plot_datasets_all_tasks(all_evaluation_results, columns=['Prec(macro)', 'Rec(macro)'])
In [33]:
for x in all_evaluation_results:
    x.sort_index(inplace=True)
indexer = [slice(None)] * len(all_evaluation_results[0].index.names)
selected_datasets = [
    'handcrafted', 'wv_wiki', 'handcrafted_wiki_classifier', 'wv_mixed_wiki_classifier'
]
indexer[all_evaluation_results[0].index.names.index('Dataset')] = selected_datasets
to_graphic = [res.loc[tuple(indexer), :] for res in all_evaluation_results]
plot_datasets_all_tasks(to_graphic, columns=['Prec(macro)', 'Rec(macro)'])
In [107]:
dataset_names = [
    ('handcrafted', 'Handcrafted Features'),
    ('wv_echr', 'Word vectors trained with the ECHR documents'),
    ('wv_wiki', 'Word vectors trained with the Wikipedia documents'),
    ('wv_mixed', 'Word vectors trained with documents from Wikipedia and ECHR'),
    ('wv_google', 'Pretrained Google News word vectors')
]
prediction_base_filename = r'test_predictions_.*{}.*csv'
base_dirname = '../results/echr/evaluations/iter{i}/train/{{dataset_name}}'
all_test_results = get_fold_results(prediction_base_filename, dataset_names, base_dirname)
In [109]:
plot_tasks_all_datasets(all_test_results)
In [108]:
plot_datasets_all_tasks(all_test_results)
In [108]:
compare_iterations('Acc', all_test_results)
In [28]:
dataset_names = [
    ('handcrafted', 'Handcrafted Features using the ECHR trained classifier'),
    ('wv_echr', 'Word vectors trained with the ECHR documents using the ECHR trained classifier'),
    ('wv_wiki', 'Word vectors trained with the Wikipedia documents using the ECHR trained classifier'),
    ('wv_mixed', 'Word vectors trained with documents from Wikipedia and ECHR using the ECHR trained classifier'),
    ('handcrafted_wiki_classifier', 'Handcrafted features using the Wikipedia trained classifier'),
    ('wv_wiki_wiki_classifier',
     'Word vectors trained with the Wikipedia documents using the Wikipedia trained classifier'),
    ('wv_mixed_wiki_classifier',
     'Word vectors trained with documents from Wikipedia and ECHR using the Wikipedia trained classifier'),
    ('wv_echr_wiki_classifier',
     'Word vectors trained with the ECHR documents using the Wikipedia trained classifier')
]
evaluation_base_filename = r'evaluation_predictions_.*{}.*csv'
evaluation_base_dirname = '../results/echr/evaluations/iter{i}/train/{{dataset_name}}'
all_evaluation_results = get_fold_results(evaluation_base_filename, dataset_names, evaluation_base_dirname)
In [29]:
((all_evaluation_results[0] + all_evaluation_results[1] + all_evaluation_results[4]) / 3.0)[important_columns]
Out[29]:
In [33]:
plot_tasks_all_datasets(all_evaluation_results, columns=['Prec(macro)', 'Rec(macro)'])
In [44]:
plot_datasets_all_tasks(all_evaluation_results, columns=['Prec(macro)', 'Rec(macro)'])
In [31]:
full_dataframes = [pandas.concat([x1, x2]) for x1, x2 in zip(all_evaluation_results, all_stanford_evaluation_results)]
plot_datasets_all_tasks(full_dataframes, columns=['Prec(macro)', 'Rec(macro)', 'F1(macro)'])
In [183]:
compare_iterations('F1(macro)', all_evaluation_results)
In [30]:
dataset_names = [
    ('stanford', ''),
    ('stanford_wiki_trained', ''),
]
evaluation_base_filename = r'.*{}-evaluation_results.csv'
evaluation_base_dirname = '../results/echr/evaluations/iter{i}/{{dataset_name}}'
stanford_iterations = ['entity', 'lkif', 'yago']
all_stanford_evaluation_results = get_fold_results(
    evaluation_base_filename, dataset_names, evaluation_base_dirname,
    iterations=stanford_iterations)
In [264]:
((all_stanford_evaluation_results[0] + all_stanford_evaluation_results[1] + all_stanford_evaluation_results[4]) / 3.0)[important_columns]
Out[264]:
In [249]:
all_stanford_evaluation_results[0]
Out[249]:
In [88]:
dataset_names = [
    ('knn', 'kNN baseline'),
]
evaluation_base_filename = r'evaluation_predictions_{}.csv'
evaluation_base_dirname = '../results/echr/evaluations/iter{i}/train/{{dataset_name}}'
knn_evaluation_results = get_fold_results(evaluation_base_filename, dataset_names, evaluation_base_dirname)
In [92]:
display((knn_evaluation_results[0] + knn_evaluation_results[1] + knn_evaluation_results[4])/3.0)
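The cells above average three hand-picked folds; if every fold were usable, the same average could be taken in one step. A sketch only, not what was run here:
In [ ]:
# Sketch: average the metrics over all folds of the kNN baseline.
display(pandas.concat(knn_evaluation_results)
        .astype(float)
        .groupby(level=['Dataset', 'Task'])
        .mean()[important_columns])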
In [19]:
dataset_names = [
    ('wiki_wv_mixed', '')
]
prediction_base_filename = r'test_predictions_.*{}.*csv'
base_dirname = '../results/{dataset_name}'
test_results = get_results(prediction_base_filename, dataset_names, base_dirname)
In [20]:
test_results
Out[20]:
In [21]:
dataset_names = [
    ('word_vectors', ''),
    ('handcrafted', ''),
]
prediction_base_filename = r'test_predictions_.*{}.*csv'
base_dirname = '/home/ccardellino/datasets/predictions/{dataset_name}/batch'
test_results = get_results(prediction_base_filename, dataset_names, base_dirname)
In [22]:
test_results
Out[22]: